{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Benchmarking MLDB\n",
    "\n",
    "This notebook contains the code to run \"[The Absolute Minimum Benchmark](https://github.com/szilard/benchm-ml/tree/master/z-other-tools)\" for a machine learning tool."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First we load the Python MLDB helper library, `pymldb`, and open a connection to the MLDB server running on localhost."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pymldb import Connection\n",
    "\n",
    "# The later cells issue their PUT/POST requests through this connection object.\n",
    "MLDB_HOST = \"http://localhost/\"\n",
    "mldb = Connection(MLDB_HOST)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next we create the datasets directly from the remote files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Datasets loaded.\n"
     ]
    }
   ],
   "source": [
    "# (dataset name, remote CSV URL) pairs. Each pair is imported by its own\n",
    "# `import.text` procedure named `import_<dataset name>`, in the order listed.\n",
    "DATASETS = [\n",
    "    (\"bench_train_1m\", \"https://s3.amazonaws.com/benchm-ml--main/train-1m.csv\"),\n",
    "    (\"bench_test\", \"https://s3.amazonaws.com/benchm-ml--main/test.csv\"),\n",
    "]\n",
    "\n",
    "for dataset_name, csv_url in DATASETS:\n",
    "    mldb.put('/v1/procedures/import_' + dataset_name, {\n",
    "        \"type\": \"import.text\",\n",
    "        \"params\": {\n",
    "            \"dataFileUrl\": csv_url,\n",
    "            \"outputDataset\": dataset_name,\n",
    "            \"runOnCreation\": True\n",
    "        }\n",
    "    })\n",
    "\n",
    "# The parenthesised form prints the same text on Python 2 and also runs on Python 3.\n",
    "print(\"Datasets loaded.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
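    "Now we define the experiment: a bagged ensemble of 100 decision trees (maximum depth 20), trained on `bench_train_1m` and evaluated on `bench_test`."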
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ready to go!\n"
     ]
    }
   ],
   "source": [
    "def labelled_rows_query(dataset):\n",
    "    \"\"\"Build the query that splits `dataset` into features and a boolean label.\n",
    "\n",
    "    Every column except `dep_delayed_15min` is gathered into `features`;\n",
    "    `label` is the result of the comparison `dep_delayed_15min = 'Y'`.\n",
    "    The same query shape is used for the training and the testing data, so\n",
    "    only the dataset name varies.\n",
    "    \"\"\"\n",
    "    return \"\"\"\n",
    "            select\n",
    "                {* EXCLUDING(dep_delayed_15min)} as features,\n",
    "                dep_delayed_15min = 'Y' as label\n",
    "            from %s\n",
    "            \"\"\" % dataset\n",
    "\n",
    "\n",
    "# This only defines the `benchmark` procedure; it is run later by POSTing to\n",
    "# its `/runs` route.\n",
    "mldb.put('/v1/procedures/benchmark', {\n",
    "    \"type\": \"classifier.experiment\",\n",
    "    \"params\": {\n",
    "        \"experimentName\": \"benchm_ml\",\n",
    "        \"inputData\": labelled_rows_query(\"bench_train_1m\"),\n",
    "        \"testingDataOverride\": labelled_rows_query(\"bench_test\"),\n",
    "        # 100 bags, each using a decision tree of depth at most 20 as weak learner.\n",
    "        \"configuration\": {\n",
    "            \"type\": \"bagging\",\n",
    "            \"num_bags\": 100,\n",
    "            \"validation_split\": 0,\n",
    "            \"weak_learner\": {\n",
    "                \"type\": \"decision_tree\",\n",
    "                \"max_depth\": 20,\n",
    "                \"random_feature_propn\": 0.3\n",
    "            }\n",
    "        },\n",
    "        \"modelFileUrlPattern\": \"file:///mldb_data/models/benchml_$runid.cls\",\n",
    "        \"mode\": \"boolean\"\n",
    "    }\n",
    "})\n",
    "\n",
    "# The parenthesised form prints the same text on Python 2 and also runs on Python 3.\n",
    "print(\"Ready to go!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, we run the experiment inside a timing block. On an otherwise-unloaded AWS EC2 r3.8xlarge instance (32 cores, 240GB of RAM) it takes around 20 seconds to reach an AUC of more than 0.74."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "AUC = 0.7430276746, time = 21.4008\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "# Time only the POST that runs the experiment; reading the response is excluded.\n",
    "start_time = time.time()\n",
    "result = mldb.post('/v1/procedures/benchmark/runs')\n",
    "run_time = time.time() - start_time\n",
    "\n",
    "# The AUC is read from the test results of the first fold in the run status.\n",
    "first_fold = result.json()[\"status\"][\"folds\"][0]\n",
    "auc = first_fold[\"resultsTest\"][\"auc\"]\n",
    "\n",
    "# The parenthesised form prints the same text on Python 2 and also runs on Python 3.\n",
    "print(\"\\n\\nAUC = %0.10f, time = %0.4f\\n\\n\" % (auc, run_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
